import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib
from matplotlib import pyplot
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor as rfr
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
if not sys.warnoptions:
import warnings
warnings.simplefilter("ignore")
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Load the Craigslist used-vehicles dataset.
# NOTE(review): assumes vehicles.csv is in the current working directory.
data = pd.read_csv("vehicles.csv")
data.head()#top 5 rows in the dataset
data.info()#summary of the dataset features and values
# checking for missing values per column
data.isnull().sum()
Before we move on, we can see from the results that some features have really low or zero representation in the data, and there are some we don't need for our analysis, such as the id and url columns.
So we will drop them.
df_car = data.drop(['id','url','region_url','county',"description", 'image_url','title_status','vin'], axis = 1)#dropping unecessary features
Again, as we noted earlier, most of the variables are categorical and the numerical ones have little or no missing values. It would be good to examine the categories so we can decide how to deal with the missing data.
# Print the category distribution of every categorical feature so we can
# decide how to handle its missing values.
categorical_cols = ['region', 'year', 'manufacturer', 'model', 'condition',
                    'fuel', 'size', 'paint_color', 'state', 'transmission',
                    'drive', 'type']
for col in categorical_cols:
    print('Value Counts for ' + col)
    print(df_car[col].value_counts())
    print('')
# Drop every row with at least one missing value; with mostly categorical
# features, imputation would distort the data more than deletion.
df_cars = df_car.dropna() #creating a new dataframe without the na values
df_cars.isnull().sum()
df_cars.info()
As seen above the new dataframe is without any missing values
Why remove the missing data?
First, we have a lot more categorical variables than numerical ones; it may not be wise to fill the missing parts by imputation because the data would be skewed. Although we lost a lot of data, we still have plenty with which to do our analysis.
# convert year column from float to int (safe now — dropna removed the NaNs
# that forced pandas to store year as float)
df_cars['year']=df_cars['year'].astype(int)
df_cars.info()
The next thing in our analysis is to examine the data a little bit more. After this we will do a little more cleaning and preparation of the data before analysis. For now, we need the names in the categorical data for better visualization.
This examination is to make more sense of the data we have, using graphs, smaller tables, etc.
# Frequency of each listed price.
df_cars['price'].value_counts()
df_cars.price.describe()#five-number summary of the price feature
# Plot a histogram to see how varied the prices are
plt.hist(df_cars["price"])
So the prices are really on the higher side, we would bring it down to thousands just to see the skewness, peak etc before we standardize or normalize the data
# Boxplot of raw prices (outliers dominate the scale).
sns.boxplot(y='price', data=df_cars)
plt.figure(figsize=(3,6))
sns.boxplot(y='price', data=df_cars,showfliers=False)#visualising the data without the outliers skewing the scale
# Violin plot shows the full density shape of the price distribution.
sns.violinplot(y='price', data=df_cars)
From the boxplot and the violinplot we see that there are a lot of outliers in the data so the data is extremely skewed. We can see that also from the describe function. The maximum value of cars sold runs in billions.
Lets look into it more closely.
# Cars with the 20 highest prices.
# Renamed from `top15`, which misleadingly held 20 rows.
top20 = df_cars.nlargest(20, "price")
top20.price
From the results above we can see that:
# Cars with the 20 lowest prices.
# Renamed from `least15`, which misleadingly held 20 rows.
least20 = df_cars.nsmallest(20, "price")
least20.price
# Manufacturers whose listings hit the overall minimum price
df = df_cars[['manufacturer','price']][df_cars.price==df_cars['price'].min()]
df.head()
4657 of 106,000 rows that we have have cars priced at zeros and 1755 priced at ones so we would remove them and see if there are other awkward cases so we know how to treat them.
# Remove ROWS (the original comments said "columns") priced at exactly
# 0 or 1 dollar — placeholder listings, not real prices. A single isin()
# mask replaces the two separate drop calls.
df_cars.drop(df_cars[df_cars['price'].isin([0, 1])].index, inplace=True)
# Re-inspect the lowest-priced remaining rows for other anomalous values
lowest = df_cars.nsmallest(1554, "price")
lowest.price
We still have some unusual cases we need to deal with. What we intend to do is limit the data to prices that are less than 3 standard deviations from the mean.
# Flag price outliers: anything 3+ standard deviations from the mean.
# Compute the z-score mask ONCE (the original recomputed it a second time
# as `outliers_sub` with identical results).
outlier_mask = np.abs(stats.zscore(df_cars['price'])) >= 3
outliers = df_cars[outlier_mask].copy()  # .copy() so the in-place sort below
                                         # cannot raise SettingWithCopyWarning
outliers.sort_values('price', inplace=True)
print("Total Outlier observations :")
print(outliers.shape)
p_out_usd = round( ((len(outliers)/len(df_cars))*100 ) , 2)
print("Proportion of Outliers : %s %%\n" % p_out_usd)
# Drop the flagged rows, reusing the same mask.
df_cars = df_cars[~outlier_mask]
print("Observations after removing outliers (price) : ")
print(df_cars.shape)
Using the z-score didn't remove all the outliers. From what we observed above, the prices after removal were at most 2 million dollars; even then, a few cars are still priced at a million dollars, so it might be good to remove those too. Also, looking at the lowest prices, we have cars for 3, 4, or 5 dollars and other unreasonable prices which are reported to be new or in excellent condition. What we intend to do is create lower and upper bounds of 1,000 dollars and 150,000 dollars respectively.
# Hard price bounds: keep only listings strictly between $1,000 and $150,000.
df_cars = df_cars[df_cars['price'] > 1000] #creating lower and upper bounds (both exclusive)
df_cars = df_cars[df_cars['price'] < 150000]#df_cars now only has price values within these limits
print('Min:' ,min(df_cars.price),'Max:',max(df_cars.price))
# NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
# sns.histplot(..., kde=True, stat='density') is the modern equivalent.
sns.distplot(df_cars.year)#visualising the year distribution density
Our data has cars that date back to before 1990; we intend to remove them to improve price predictions, as the data is concentrated in the last 30 years.
(df_cars['year'] <1990).sum()#total values under 1990
Only 2137 observations are from before 1990. That is only a minute fraction of the total observations, hence we can remove them.
# Keep cars from 1990 onward. The original filter used `> 1990`, which also
# silently dropped the year-1990 rows even though only `< 1990` rows were
# counted above — use >= to match the stated intent.
df_cars1 = df_cars[(df_cars['year'] >= 1990)]
df_cars1.info()#info about cars listed from 1990 onward
# plotting the data from 1990 and above
plt.figure(figsize=(15,9))
ax = sns.countplot(x='year',data=df_cars1,palette = 'GnBu_d');
ax.set_ylabel('Cars Sold',fontsize=15)
ax.set_title('Number of cars sold in last 30 years',fontsize=15)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right",fontsize=10);
The data shows a dip at the 2009-2010 period which could be attributed to the global slump in the car sales as a result of the recession
df_cars1['odometer'].describe() #summary of odometer feature
(df_cars1['odometer']==0).sum()#count of cars with a zero odometer reading
#Odometer typical values; outliers to be handled below
sns.boxplot(y='odometer', data=df_cars1)
The data is heavily skewed as observed in the boxplot
#Visualizing odometer using a density plot
x = df_cars1['odometer']
sns.distplot(x, kde=False)
# Zero readings must go before the log transform (log10(0) is -inf).
df_cars1 = df_cars1[(df_cars1['odometer'] >0)]#removing all the zero valued odometer readings
df_odometerlog = np.log10(df_cars1['odometer'])#log transformation to reduce the right skew
x = df_odometerlog
sns.distplot(x, kde=True);
The distribution is more normalized now
# Wrap in a DataFrame purely for the tabular head() display.
df_odometerlog = pd.DataFrame(df_odometerlog)
df_odometerlog.head()#top log-transformed values
# Assign the Series (single column), not the whole DataFrame: assigning a
# DataFrame to one column raises ValueError on modern pandas.
df_cars1['odometer'] = df_odometerlog['odometer']
df_cars1['odometer'].head()
Transforming the cylinder values with values ranging from 0-12
# Map the cylinder category strings onto integer counts; "other" becomes 0.
cylinder_num = {"cylinders": {**{f"{n} cylinders": n for n in (3, 4, 5, 6, 8, 10, 12)},
                              "other": 0}}
cylinder_num
df_cars1.replace(cylinder_num, inplace=True)
df_cars1['cylinders'].value_counts()
df_cars1['condition'].unique()  # unique condition labels
# Collapse the ratings onto the Kelley Blue Book scale in one replace call:
# 'like new' and 'new' -> 'fair', 'salvage' -> 'poor'.
df_cars1['condition'] = df_cars1["condition"].replace(
    {'like new': "fair", 'new': "fair", 'salvage': "poor"})
df_cars1['condition'].value_counts()
df_cars1.head()
df_cars1['fuel'].value_counts()  # information on fuel types
# Display the missing values
# NOTE(review): this plots the ORIGINAL `data` frame, not the cleaned
# df_cars1 — presumably intentional (show raw missingness); confirm.
plt.figure(figsize=(12,12))
plt.title("Missing values for each column")
sns.heatmap(data.isnull())
plt.show()
# Horizontal bar chart of the ten most common manufacturers.
top10_counts = df_cars1['manufacturer'].value_counts().iloc[:10]
manufacturers = pd.DataFrame({'manufacturer': top10_counts.index,
                              'count': top10_counts.values})
plt.figure(figsize=(15, 10))
ax = sns.barplot(y='manufacturer', x='count', data=manufacturers,
                 order=manufacturers['manufacturer'], palette="cubehelix")
ax.set_yticklabels(ax.get_yticklabels(), rotation=90, ha="right", fontsize=10);
Ford is the largest manufacturer of cars in USA
#Plotting top car production years
plt.figure(figsize=(15,10))
years_top10 = df_cars1['year'].value_counts().iloc[:10]
years = pd.DataFrame({'year': years_top10.index, 'count': years_top10.values})
ax = sns.barplot(x='year', y='count', data=years, order = years['year']);
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right",fontsize=8);
plt.title("Top 10 years car production",size = 25);
# (Removed a stray `fig = plt.figure(figsize=(20,20))` that opened a large
# empty figure and was never used.)
2013 is the year with highest number of cars produced
#Plotting the condition of the vehicles
condition_counts = df_cars1['condition'].value_counts()
condition = pd.DataFrame({'Condition': condition_counts.index,
                          'No.of vehicles': condition_counts.values})
plt.figure(figsize=(15, 8))
plt.title('Condition of the vehicles', size=25)
ax = sns.barplot(y='Condition', x='No.of vehicles', data=condition,
                 order=condition['Condition'], palette='YlGnBu')
ax.set_yticklabels(ax.get_yticklabels(), fontsize=20);
Majority vehicles are in excellent condition
#Plotting fuel type against the number of vehicles
fuel_counts = df_cars1['fuel'].value_counts()
fuel = pd.DataFrame({'Fuel': fuel_counts.index, 'count': fuel_counts.values})
plt.figure(figsize=(15, 8))
plt.title('Fuel type Vs No.of vehicles', size=25)
ax = sns.barplot(x='Fuel', y='count', data=fuel,
                 order=fuel['Fuel'], palette="YlGnBu")
ax.set_xticklabels(ax.get_xticklabels(), ha="right", fontsize=20);
Vehicles running on gasoline take up the lion's share of the market.
# Mean price per cylinder count. The original plotted groupby(...).count()
# (listing counts) while labelling the y-axis "Price"; mean() makes the plot
# match its title.
plt.figure (figsize = (12,5))
plt.ylabel ('Price',size =15)
plt.xlabel ('# of Cylinders',size=15)
plt.title('Price Variation of Cylinders',size=25)
plt.xticks(size=10)
plt.yticks(size=10)
Cylin_df = df_cars1.groupby ('cylinders')['price'].mean()
plt.plot(Cylin_df)
plt.show()
# Mean price per transmission type (was groupby(...).count(), which plotted
# listing counts against a "Price" axis label).
plt.figure (figsize = (12,5))
plt.ylabel ('Price',size =15)
plt.xlabel ('Transmission Type',size=15)
plt.title('Price Variation of Transmission Type',size=25)
plt.xticks(size=10)
plt.yticks(size=10)
Cylin_df = df_cars1.groupby ('transmission')['price'].mean()
plt.plot(Cylin_df)
plt.show()
# Mean price per vehicle type (was groupby(...).count(), which plotted
# listing counts against a "Price" axis label).
plt.figure (figsize = (15,5))
plt.ylabel ('Price',size =15)
plt.xlabel ('Type',size=15)
plt.title('Price Variation Vs Type of vehicle',size=25)
plt.xticks(size=10)
plt.yticks(size=10)
Cylin_df = df_cars1.groupby ('type')['price'].mean()
plt.plot(Cylin_df)
plt.show()
# Mean price per manufacturer (was groupby(...).count(), which plotted
# listing counts against a "Price" axis label).
plt.figure (figsize = (20,5))
plt.ylabel ('Price',size =15)
plt.xlabel ('Manufacturer',size=15)
plt.title('Price Variation Vs Manufacturer',size=25)
plt.xticks(size=10)
plt.yticks(size=10)
plt.xticks(rotation=45)
Cylin_df = df_cars1.groupby ('manufacturer')['price'].mean()
plt.plot(Cylin_df)
plt.show()
# Scatter of price against model year.
pricecar = df_cars1[['year','price']]
pricechange = pricecar[pricecar['year']>=1990]
plt.scatter(pricechange['year'], pricechange['price'])
plt.title('year vs price ')
plt.xlabel('year')
plt.ylabel('Price ($)')  # prices were capped at $150k above; the old
                         # "Price Billion $" label was wrong
plt.show()
# Pie charts of condition, fuel, and transmission shares (one figure each,
# in that order), drawn from the pre-year-filter frame df_cars.
for pie_column in ("condition", "fuel", "transmission"):
    counts = df_cars[pie_column].value_counts()
    plt.figure(figsize=(10, 20))
    plt.pie(counts, labels=counts.index, autopct='%.2f')
    plt.show()
from plotly.subplots import make_subplots
# One pie of paint-colour shares per fuel type, side by side.
# 'domain'-type axes are required for Pie traces.
fig = make_subplots(rows=1, cols=3,
                    specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
for col_idx, (fuel_kind, trace_name) in enumerate(
        [("gas", "Gas Car"), ("diesel", "Diesel Car"), ("electric", "Electric Car")],
        start=1):
    colour_counts = data[data["fuel"] == fuel_kind].paint_color.value_counts().head(50)
    fig.add_trace(go.Pie(labels=colour_counts.index, values=colour_counts.values,
                         name=trace_name),
                  1, col_idx)
fig.show()
# Pie chart of cylinder-category frequencies in the raw data.
cyl_counts = data.cylinders.value_counts()
cylindersframe = pd.DataFrame({"Cylinders": cyl_counts.index,
                               "Car_cylinders": cyl_counts.values})
cylindersframe["Cylinders"] = cylindersframe["Cylinders"].astype(str)
cylindersframe.set_index("Cylinders", inplace=True)
pie_trace = go.Pie(labels=cylindersframe.index,
                   values=cylindersframe.Car_cylinders,
                   hoverinfo="percent+label+value",
                   hole=0.1,
                   marker=dict(line=dict(color="#000000", width=2)))
layout4 = go.Layout(title="Cylinders Pie Chart")
fig4 = go.Figure(data=[pie_trace], layout=layout4)
iplot(fig4)
# NOTE(review): sns.barplot aggregates (mean with CI) per manufacturer, so
# this pre-sort does not change the bars — presumably a leftover; confirm.
data=data.sort_values(by=['odometer'],ascending=False)
# Mean odometer reading per manufacturer
plt.figure(figsize=(25,15))
sns.barplot(x=data.manufacturer, y=data.odometer)
plt.xticks(rotation= 90)
plt.xlabel('Manufacturer',fontsize = 25)
plt.ylabel('Odometer',fontsize = 25)
plt.show()
# Mean price per manufacturer
plt.figure(figsize=(25,15))
sns.barplot(x=data.manufacturer , y=data.price)
plt.xticks(rotation= 90)
plt.xlabel('Manufacturer')
plt.ylabel('Price')
plt.show()
# Bar chart of vehicle-type frequencies; NaN kept as its own bar.
plt.figure(figsize=(20, 15))
plt.xlabel('Vehicle type', fontsize=20)
plt.ylabel('Count', fontsize=20)
plt.title("Types of vehicle", fontsize=20)
type_counts = data.type.value_counts(dropna=False)
type_counts.plot(kind="bar")
plt.show()
# Choropleth of listings per US state; Plotly wants two-letter codes in
# upper case for locationmode='USA-states'.
statecount = df_cars1.state.value_counts()
statecount.index = statecount.index.map(str.upper)
datamap = go.Choropleth(
    colorscale='Reds',
    locations=statecount.index,
    z=statecount,
    locationmode='USA-states',
    marker=dict(line=dict(color='rgb(255,255,255)', width=2)),
    colorbar={'title': "Cars listed per State"},
)
layout = go.Layout(
    title='Cars listed per State',
    geo=dict(scope='usa', showlakes=True, lakecolor='rgb(85,173,240)'),
)
choromap = go.Figure(data=[datamap], layout=layout)
iplot(choromap)
As we can see from the visualization above California has the highest number of cars sales followed by Texas
Now we would convert the categorical variables to integers so the machine learning model performs well.
# The columns selected for further analysis (target + candidate features).
# .copy() so the label-encoding assignments below modify an independent frame
# rather than a view of df_cars1 (avoids SettingWithCopyWarning / lost writes).
data_encode = df_cars1[['price','region','year','manufacturer','condition','cylinders','fuel', 'odometer', 'transmission', 'drive','size','type','paint_color','state']].copy()
data_encode.head()#dataframe with the categorical and ordinal features
# Separate the categorical features from the numeric ones.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = [col for col in data_encode.columns
                       if data_encode[col].dtype not in numerics]
# Encode each categorical feature as integer codes.
# (The original's `if col in data_encode.columns` guard was always true.)
for col in categorical_columns:
    le = LabelEncoder()
    data_encode[col] = le.fit_transform(data_encode[col].astype(str))
data_encode.head(10)#visualizing the encoded values
Now that we have the data all cleaned and encoded, we can start to apply multiple machine learning algorithms in order to see which performs best.
# Feature matrix: everything except the target.
X = data_encode.drop('price', axis=1)
X.head()
y = data_encode.price  # target variable
y.head()
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=45)
# Ordinary least-squares baseline model.
linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred1 = linreg.predict(X_test)
print('Accuracy:', r2_score(y_test, y_pred1))  # r^2 on the held-out set
# Error metrics of the regression model
mae = metrics.mean_absolute_error(y_test, y_pred1)
mse = metrics.mean_squared_error(y_test, y_pred1)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
# RidgeCV over the candidate alphas.
# `normalize=True` was deprecated in sklearn 0.24 and REMOVED in 1.2;
# standardize the features explicitly with StandardScaler instead
# (the migration path documented by scikit-learn).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
ridge1 = RidgeCV(alphas=[0.01, 0.1, 0.5, 1.0, 10.0, 100])
ridge1.fit(X_train_scaled, y_train)  # fitting the model
y_pred = ridge1.predict(X_test_scaled)  # predicting values
# best alpha
print(ridge1.alpha_)
print('Accuracy:' ,r2_score( y_test, y_pred))#printing the accuracy scores
#Errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# LassoCV picks the best alpha from the grid by internal cross-validation.
lasso_cv = LassoCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.5, 1.0])
lasso_cv.fit(X_train, y_train)
# best alpha
print(lasso_cv.alpha_)
y_pred1 = lasso_cv.predict(X_test)
print('Accuracy:' ,r2_score( y_test, y_pred1))
#Error values
mae = metrics.mean_absolute_error(y_test, y_pred1)
mse = metrics.mean_squared_error(y_test, y_pred1)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
Finding the best number of estimators for the random forest regressor from a range of values, for parameter tuning.
# Sweep n_estimators for the random-forest regressor, scoring each setting
# with 3-fold explained-variance cross-validation.
estimators = [10,20,30,40,50,60,70,80,100,110,120]
mean_rfrs = []
std_rfrs_upper = []
std_rfrs_lower = []
# (Removed an unused `yt = [...]` copy of the target column.)
np.random.seed(11111)
for i in estimators:
    model = rfr(n_estimators=i, max_depth=None)
    scores_rfr = cross_val_score(model, X, y, cv=3, scoring='explained_variance')
    print('estimators:', i)
    print("Est. explained variance: %0.2f (+/- %0.2f)" % (scores_rfr.mean(), scores_rfr.std() * 2))
    print('')
    mean_rfrs.append(scores_rfr.mean())
    std_rfrs_upper.append(scores_rfr.mean() + scores_rfr.std() * 2)  # error band, upper
    std_rfrs_lower.append(scores_rfr.mean() - scores_rfr.std() * 2)  # error band, lower
# Plot mean explained variance with a +/- 2 std band.
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(111)
ax.plot(estimators,mean_rfrs,marker='o',
        linewidth=2,markersize=10)
ax.fill_between(estimators,std_rfrs_lower,std_rfrs_upper,
                facecolor='red',alpha=0.1,interpolate=True)
ax.set_ylim([0,1])
ax.set_xlim([0,250])
plt.title('Expected Variance of Random Forest Regressor')
plt.ylabel('Expected Variance')
plt.xlabel('Trees in Forest')
plt.grid()
plt.show()
From the plot above we can see that there is not much effect on varying the number of trees on the accuracy score of the model.Hence we can use either 110 or 200 trees
# Final random forest with 110 trees, evaluated on the held-out split.
# with 110 trees
regressor = rfr(n_estimators = 110, random_state = 42)
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
print('Accuracy:' ,r2_score( y_test, y_pred))
# Errors
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Eliminating irrelevant features helps the model perform better. By using backward elimination with the level of significance set at 0.01, we can filter through the features, selecting only the relevant ones.
#Backward Elimination method for feature selection
import statsmodels.api as sm
cols = list(X.columns)
# Repeatedly drop the feature with the largest p-value until every remaining
# coefficient is significant at the 0.01 level. (Removed the dead `pmax = 1`
# sentinel and the `p = []` that was immediately overwritten.)
while len(cols) > 0:
    X_1 = sm.add_constant(X[cols])
    model = sm.OLS(y, X_1).fit()
    # p-values of the feature coefficients; index 0 is the constant term
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    if pmax > 0.01:
        cols.remove(p.idxmax())
    else:
        break
selected_features_BE = cols
print(p.sort_values(ascending = False))
selected_features_BE
data_encode.head()
# Keep every feature that survived backward elimination
# (only `region` was eliminated).
data_rfe = data_encode[['year', 'manufacturer', 'condition', 'cylinders',
                        'fuel', 'odometer', 'transmission', 'drive', 'size',
                        'type', 'paint_color', 'state', 'price']]
Running the random forest regressor on the selected features
# Re-run the random forest on the reduced feature set (region removed).
X = data_rfe.drop('price',axis = 1)#dropping target variable
y = data_rfe.price
X_train, X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state = 45)#split train test
regressor = rfr(n_estimators = 100, random_state = 42)#mapping regressor
regressor.fit(X_train,y_train)#running random forest regressor without the region feature
y_pred = regressor.predict(X_test)
print('Accuracy:',r2_score( y_test, y_pred))
Backward elimination has only dropped the region feature; running the model without that feature shows a slight accuracy increase from the previous score of 0.857641848801272, but not a significant rise.
# fit the model
# `silent` was removed from the xgboost API (deprecated in 1.0, later
# raises); verbosity=0 is the supported way to suppress log output.
xgb = XGBRegressor(random_state=42, verbosity=0)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print('Accuracy:',r2_score( y_test, y_pred))
This model has an accuracy score lower than that of random forest
As we can see from the different model performances we can conclude that random forest regression has the highest predictive accuracy with 85.86%
To prevent over fitting we performed 10 fold cross validation of the random forest regressor model which gave the best accuracy score
# 10-fold cross-validation of the final random forest (default scorer: r^2).
scores_regressor = cross_val_score(regressor, X, y, cv=10)
print(scores_regressor.mean())
# Bare expression: displays in a notebook cell; no effect when run as a script.
"Est. explained variance: %0.2f (+/- %0.2f)" % (scores_regressor.mean(), scores_regressor.std() * 2)
The CV score implies that the random forest model generalizes well and is not overfitting.
The reason for the underperformance of multiple linear regression is that the data is not linear. Since random forest captures non-linear interactions between the features and the target, it works well with our dataset predicting the price of the used cars with an accuracy rate of 85.7 percent.
https://www.kaggle.com/austinreese/craigslist-carstrucks-data
https://www.geeksforgeeks.org/python-pandas-series-str-replace-to-replace-text-in-a-series/
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
https://stackoverflow.com/questions/58318685/how-to-hide-warnings-from-xgboost-library-in-jupyter